kSanity-VIRGO metagenomics counts QC

Author

Laura Symul, Laura Vermeren

Published

May 27, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(labelled)
library(vegan)
library(RColorBrewer)

# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"
data_dir <- get_data_dir(data_source)

if (data_source == "real") {
  mg_dir <- str_c(data_dir, "00 Raw/02 Metagenomics/")
  counts <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250404.csv"))
  counts_corr <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_taxaGLcor_20250404.csv"))
  relabs <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_RelAbund_20250404.csv"))
  technical_metadata <- read_csv(str_c(mg_dir, "VIBRANT_MG_technicalMetaData_20250502.csv"))
  LBP_strain_info <- readxl::read_xlsx(str_c(data_dir, "00 Raw/00 Trial Data/IsolateNumbers.xlsx"))
  VIRGO2_taxonomic_table <- read_csv(str_c(mg_dir, "VIRGO2_taxonomy_key_250429.csv"))
} else {
  mg_dir <- str_c(data_dir, "03 metagenomics combined/")
  counts <- read_csv(str_c(mg_dir, "mg_combined.csv"))
  manifest <- read_csv(str_c(mg_dir, "mg_combined_manifest.csv"))
  
}

We load the following tables, all provided by Michael France:

  • technical_metadata technical metadata (dimensions: 1127 samples x 23 columns)

  • counts raw counts per LBP strain and taxa (dimensions: 966 samples x 783 columns)

  • counts_corr counts per LBP strain and taxa corrected by gene length (dimensions: 966 samples x 782 columns)

  • relabs relative abundances per LBP strain and taxa, computed by M. France from counts_corr (dimensions: 966 samples x 782 columns)

  • VIRGO2_taxonomic_table taxonomic table for the taxa as defined in VIRGO2 (dimensions: 779 taxa x 10 columns)

Manifest QC

In this section, we check, harmonize, and transform the manifest and technical metadata provided by Michael France.

Code
technical_metadata_original <- technical_metadata

Column name cleaning and description

The technical_metadata table contains the following columns:

Code
technical_metadata |> glimpse()
Rows: 1,127
Columns: 23
$ UID                           <chr> "MG_2151249", "MG_2146131", "MG_2152234"…
$ SampleNumber                  <chr> "2151249", "2146131", "2152234", "215690…
$ PID                           <chr> "68200008", "68200008", "68200008", "682…
$ VisitCode                     <chr> "0010", "1000", "1100", "1200", "1200", …
$ VisitCodeFlagged              <dbl> 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ SampleType                    <chr> "ClinicalSample", "ClinicalSample", "Cli…
$ Ext_Lib_Plate                 <dbl> 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 2, 2, 7, 7…
$ Ext_Lib_Plate_ID              <dbl> 2316094, 2316094, 2316094, 2316297, 2316…
$ Ext_Lib_Position              <chr> "B6", "A7", "D7", "G1", "G1", "G2", "G2"…
$ SequencingRun                 <chr> "250213_A00904_0437_AHVLW5DSXC", "250213…
$ DateSequenced                 <dbl> 250213, 250213, 250213, 250228, 250207, …
$ Lane                          <dbl> 3, 3, 3, 4, 3, 2, 3, 2, 3, 4, 2, 4, 3, 3…
$ Sample                        <chr> "NA19213-001", "NA19220-001", "NA19223-0…
$ `Library Pool`                <chr> "IP707", "IP707", "IP707", "IP755", "IP6…
$ Library                       <chr> "IL19213-001", "IL19220-001", "IL19223-0…
$ FragmentSize                  <dbl> 307, 352, 372, 287, 287, 364, 364, 357, …
$ IndexSet                      <chr> "N2UD-B06", "N2UD-A07", "N2UD-D07", "N1U…
$ `Index 1`                     <chr> "CTTGACGA", "TGTTCGCC", "AGTCACAT", "TAG…
$ `Index 2`                     <chr> "GCACACAA", "AGGTAGGA", "CATTATGG", "GCC…
$ BioinformaticsProcessingBatch <dbl> 5, 5, 5, 6, 1, 6, 1, 6, 1, 5, 6, 1, 5, 5…
$ `Selected4re-extraction`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ LibrarySequencedTwice         <dbl> NA, NA, NA, 1, NA, 1, NA, 1, NA, NA, 1, …
$ Notes                         <chr> NA, NA, NA, "requeue for additional cove…
Code
# checks
all(technical_metadata$UID == str_c("MG_", technical_metadata$SampleNumber))
Code
dictionary <- 
  tibble(original_name = colnames(technical_metadata)) |> 
  mutate(
    description = 
      case_when(
        (original_name == "UID") ~ '"MG_" followed by the `SampleNumber`',
        (original_name == "SampleNumber") ~ 'Vaginal swab barcode for metagenomics and qPCR',
        (original_name == "PID") ~ 'Participant ID as provided in the metagenomics manifest',
        (original_name == "VisitCode") ~ '4-digit visit code (character)',
        (original_name == "VisitCodeFlagged") ~ '1 if Michael flagged the visit code as unexpected',
        (original_name == "SampleType") ~ 'Sample type (e.g., Clinical sample, Control, etc.)',
        (original_name == "Ext_Lib_Plate") ~ 'Extraction library plate number',
        (original_name == "Ext_Lib_Plate_ID") ~ 'Extraction library plate ID',
        (original_name == "Ext_Lib_Position") ~ 'Well position of sample on the extraction library plate',
        (original_name == "SequencingRun") ~ 'Sequencing run ID',
        (original_name == "DateSequenced") ~ 'Sequencing date',
        (original_name == "Lane") ~ 'Sequencing lane',
        (original_name == "Sample") ~ 'Sequencing sample ID',
        (original_name == "Library Pool") ~ 'Library pool',
        (original_name == "Library") ~ 'Pooled library ID',
        (original_name == "FragmentSize") ~ 'Size of fragment',
        (original_name == "IndexSet") ~ 'Index set ID',
        (original_name == "Index 1") ~ 'Index 1',
        (original_name == "Index 2") ~ 'Index 2',
        (original_name == "BioinformaticsProcessingBatch") ~ 'Bioinformatics processing batch number',
        (original_name == "Selected4re-extraction") ~ '1 if sample has been selected for re-extraction by Michael (poor signal samples and a few random good signal samples)',
        (original_name == "LibrarySequencedTwice") ~ '1 if sample has been sequenced twice',
        (original_name == "Notes") ~ 'Notes from Michael France',
        TRUE ~ "????"
      )
  )

We rename and harmonize the column names of the technical metadata.

Code
technical_metadata <- 
  technical_metadata |> 
  janitor::clean_names() |>
  dplyr::rename(
    mg_uid = uid,
    mg_pid = pid,
    mg_visit_code = visit_code,
    mg_sample_type = sample_type,
    swab_barcode = sample_number,
    ext_lib_plate_nb = ext_lib_plate,
    sequencing_sample_id = sample,
    sequencing_lane = lane,
    sequencing_date = date_sequenced,
    selected_for_re_extraction = selected4re_extraction
  ) 
Code
dictionary <- 
  dictionary |>  
  mutate(name = colnames(technical_metadata)) |> 
  select(name, original_name, description)
Code
dictionary |> gt() |> 
  cols_label(
    name = "New column name",
    original_name = "Original column name",
    description = "Description"
  ) 
New column name Original column name Description
mg_uid UID "MG_" followed by the `SampleNumber`
swab_barcode SampleNumber Vaginal swab barcode for metagenomics and qPCR
mg_pid PID Participant ID as provided in the metagenomics manifest
mg_visit_code VisitCode 4-digit visit code (character)
visit_code_flagged VisitCodeFlagged 1 if Michael flagged the visit code as unexpected
mg_sample_type SampleType Sample type (e.g., Clinical sample, Control, etc.)
ext_lib_plate_nb Ext_Lib_Plate Extraction library plate number
ext_lib_plate_id Ext_Lib_Plate_ID Extraction library plate ID
ext_lib_position Ext_Lib_Position Well position of sample on the extraction library plate
sequencing_run SequencingRun Sequencing run ID
sequencing_date DateSequenced Sequencing date
sequencing_lane Lane Sequencing lane
sequencing_sample_id Sample Sequencing sample ID
library_pool Library Pool Library pool
library Library Pooled library ID
fragment_size FragmentSize Size of fragment
index_set IndexSet Index set ID
index_1 Index 1 Index 1
index_2 Index 2 Index 2
bioinformatics_processing_batch BioinformaticsProcessingBatch Bioinformatics processing batch number
selected_for_re_extraction Selected4re-extraction 1 if sample has been selected for re-extraction by Michael (poor signal samples and a few random good signal samples)
library_sequenced_twice LibrarySequencedTwice 1 if sample has been sequenced twice
notes Notes Notes from Michael France
Note

@Michael: let us know if you think that the descriptions are accurate or if some should be corrected or could be more precise :)

Sample and control type harmonization

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    sample_type = mg_sample_type |> str_replace("lS", "l s") |> str_replace("b\\+C", "b \\+ C"),
    control_type = 
      case_when(
        sample_type == "Clinical sample" ~ NA_character_,
        TRUE ~ sample_type
      ) |> factor(),
    sample_type = 
      case_when(
        str_detect(sample_type, "Clinical") ~ "Clinical sample",
        str_detect(sample_type, "Mock") ~ "Positive control",
        TRUE ~ "Negative control"
      ) |> 
      fct_infreq()
  ) |> 
  relocate(sample_type, control_type, .after = mg_sample_type) 

technical_metadata |> dplyr::count(mg_sample_type, sample_type, control_type) |> gt()
mg_sample_type sample_type control_type n
ClinicalSample Clinical sample NA 1109
Mock 1 Positive control Mock 1 5
Mock 2 Positive control Mock 2 4
Nuclease-free water Negative control Nuclease-free water 5
Unused swab+C2 Negative control Unused swab + C2 4
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = c("sample_type", "control_type"),
      description = c("Sample type (harmonized)", "Control type (Mock 1, Mock 2, etc.)")
    )
  )

Participant ID harmonization

In the technical metadata, the participant ID is provided in three different formats.

We harmonize them to match the pattern 068[12]0[0-9]{4} (e.g., “068101234”).

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
      pid = 
        case_when(
          str_detect(mg_pid, "068-") ~ mg_pid |> str_remove_all("-"),
          str_detect(mg_pid, "^68") ~ str_c("0", mg_pid),
          TRUE ~ mg_pid
        )
    ) |> 
    relocate(pid, .before = mg_pid)
Code
technical_metadata |> 
  mutate(pid_length = str_length(pid)) |> 
  filter(pid_length != 9) |> 
  View()

There are a few samples from a participant from a different study (“CAP098”):

Code
technical_metadata |> 
  mutate(pid_length = str_length(pid)) |> 
  filter(pid_length != 9, sample_type == "Clinical sample") |> 
  select(mg_uid, swab_barcode, pid, mg_pid, mg_visit_code, ext_lib_plate_nb) |> 
  gt()
mg_uid swab_barcode pid mg_pid mg_visit_code ext_lib_plate_nb
MG_2285783 2285783 98120009 98120009 1010 5
MG_2285793 2285793 98120009 98120009 1020 5
MG_2285794 2285794 98120009 98120009 1030 5
MG_2285800 2285800 98120009 98120009 1040 5
MG_2285806 2285806 98120009 98120009 1050 5
MG_2285808 2285808 98120009 98120009 1060 5
MG_2285809 2285809 98120009 98120009 1070 5

We modify the sample_type of these samples to “Clinical sample (other study)”.

Code
technical_metadata <- 
  technical_metadata |> 
  mutate(
    sample_type = 
      ifelse(
        str_detect(pid, "^98"), 
        "Clinical sample (other study)", 
        sample_type |> as.character()
        ) |> factor()
    ) 

Visit codes are already in the desired format (i.e., a 4 digit character string), so we keep them as is.

Code
technical_metadata <-
  technical_metadata |> 
  mutate(visit_code = mg_visit_code)
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = c("pid", "visit_code"),
      description = c("Harmonized participant ID", "Harmonized visit code")
    )
  )

Samples sequenced twice

Code
samples_sequenced_twice <- 
  technical_metadata |> 
  group_by(mg_uid) |> 
  summarize(n_rows = n(), library_sequenced_twice = sum(library_sequenced_twice, na.rm = TRUE)) |> 
  arrange(-n_rows)

# samples_sequenced_twice |> dplyr::count(n_rows, library_sequenced_twice)

samples_sequenced_twice <- 
  samples_sequenced_twice |> 
  filter(n_rows > 1) 

# sum(technical_metadata$library_sequenced_twice, na.rm = TRUE) == nrow(samples_sequenced_twice)

161 samples have been sequenced twice to increase the coverage. The technical metadata contains a column library_sequenced_twice (prev. LibrarySequencedTwice) that indicates whether a sample has been sequenced twice. Since the data is merge after sequencing and before bioinformatics processing (VIRGO2 and kSANITY), we need to merge the technical metadata for these samples so they match the counts, counts_corr and relabs data.

Code
samples_sequenced_twice |> 
  gt(caption = "Samples that have been sequenced twice") 
Code
rm(samples_sequenced_twice)
Code
technical_metadata_agg <- 
    technical_metadata |> 
    group_by(mg_uid) |> 
    arrange(sequencing_date) |> 
    summarise(
      across(everything(), ~ unique(.) |> na.omit() |> str_c(collapse = "; ")),
    ) |> 
    mutate(
      ext_lib_plate_nb = ext_lib_plate_nb |> as.integer(),
      library_sequenced_twice = ifelse(library_sequenced_twice == 1, TRUE, FALSE),
      selected_for_re_extraction = ifelse(selected_for_re_extraction == 1, TRUE, FALSE)
    ) 

Matching the sequenced samples with the expected list of samples (CRF35)

We now match the technical metadata with the expected list of samples from CRF35 (Specimen collection).

Code
expected_samples <- 
  read_csv(
    str_c(
      "/Users/laurasymul/Dropbox/Academia/Projects/VIBRANT Study Files/", 
      "91_VIBRANT_specimen_collection_CRF_data/CRF35_weekly_specimen_collection.csv"
      )
    )
Code
expected_samples <- 
  expected_samples |> 
  mutate(
    in_CRF_35 = TRUE,
    visit_code = visit_code |> as.character() |> str_pad(width = 4, pad = "0", side = "left"), 
    pid = str_c("068", pid |> as.character()),
    other_specimen_collected = 
      case_when(
        (softcup == "Checked") | (endocervical_cytobrush == "Checked") ~ TRUE,
        TRUE ~ FALSE
      )
  ) 
Code
matched <- 
  dplyr::full_join(
    expected_samples |> select(pid, visit_code, in_CRF_35, self_collected_swabs, other_specimen_collected),
    technical_metadata_agg |> 
      select(pid, visit_code, sample_type, control_type, swab_barcode, mg_pid) |> 
      mutate(has_mg_data = TRUE),
    by = join_by(pid, visit_code)
  ) |> 
  dplyr::left_join(
    expected_samples |> 
      select(pid, location, randomized) |> 
      distinct() |> 
      mutate(pid_in_CRF_35 = TRUE),
    by = join_by(pid)
  ) |> 
  mutate(
    randomized = randomized |> replace_na(FALSE),
    in_CRF_35 = in_CRF_35 |> replace_na(FALSE),
    has_mg_data = has_mg_data |> replace_na(FALSE) ,
    pid_in_CRF_35 = pid_in_CRF_35 |> replace_na(FALSE),
    self_collected_swabs = self_collected_swabs |> replace_na(0)
  )
Code
matched <- 
  matched |> 
  group_by(pid, visit_code) |> 
  mutate(n_mg_rows = n()) |>
  ungroup() |> 
  mutate(
    category = 
      case_when(
        sample_type == "Clinical sample (other study)" ~ "Clinical sample from another study",
        sample_type != "Clinical sample" ~ "Not a clinical sample",
        has_mg_data & !pid_in_CRF_35 ~ "Unexpected participant ID",
        has_mg_data & !randomized ~ "Sequenced sample from non-randomized participants",
        has_mg_data & !in_CRF_35 ~ "Sequenced sample not listed in CRF35",
        !has_mg_data & in_CRF_35 & (self_collected_swabs > 0) & randomized ~ "Sample from a randomized participant not sequenced",
        !has_mg_data & in_CRF_35 & (self_collected_swabs > 0) & (!randomized) ~ "Sample from a non-randomized participant not sequenced",
        !has_mg_data & in_CRF_35 & (self_collected_swabs == 0) ~ "No swab collected according to CRF35 (and no MG data)",
        has_mg_data & in_CRF_35 & (self_collected_swabs == 0) ~ "Unexpected sequencing data (no swab collected according to CRF35)",
        has_mg_data & in_CRF_35 ~ "Expected sample",
        TRUE ~ "???"
      )
  ) 
Code
matched |> 
  mutate(randomized = ifelse(randomized, "Randomized", "Not randomized")) |> 
  dplyr::count(randomized, has_mg_data, category, name = "n samples") |> 
  arrange(randomized, has_mg_data, -`n samples`) |> 
  dplyr::rename(
    `Sample category` = category,
    `Randomized` = randomized,
  ) |>
  group_by(Randomized) |> 
  gt(row_group_as_column = TRUE) 
has_mg_data Sample category n samples
Not randomized FALSE Sample from a non-randomized participant not sequenced 271
FALSE No swab collected according to CRF35 (and no MG data) 6
TRUE Sequenced sample from non-randomized participants 74
TRUE Not a clinical sample 18
TRUE Clinical sample from another study 7
Randomized FALSE Sample from a randomized participant not sequenced 39
FALSE No swab collected according to CRF35 (and no MG data) 8
TRUE Expected sample 864
TRUE Sequenced sample not listed in CRF35 3
Code
plot_sample_categories <- function(matched){
  
  matched |> 
    mutate(
      pid = pid |> fct_infreq(), 
      randomized = ifelse(randomized, "Randomized", "Not randomized") |> factor() |> fct_rev()
    ) |> 
    ggplot() +
    aes(x = pid, y = visit_code, fill = category) +
    geom_tile() +
    facet_grid(. ~ location + randomized, scales = "free", space = "free") +
    scale_fill_manual(values = c(
      "Expected sample" = "steelblue1",
      "No swab collected according to CRF35 (and no MG data)" = "gray80",
      "Not a clinical sample" = "gray60",
      "Clinical sample from another study" = "gray40",
      "Sample from a randomized participant not sequenced" = "red",
      "Sample from a non-randomized participant not sequenced" = "red4",
      "Sequenced sample from non-randomized participants" = "steelblue3",
      "Sequenced sample not listed in CRF35" = "orange",
      "Unexpected participant ID" = "purple"
    )) +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
  
}
Code
plot_sample_categories(matched) + ggtitle("Combined rows from the MG manifest and CRF35 data")

Code
plot_sample_categories(matched |> filter(randomized)) + 
  ggtitle('Randomized participants')

Code
matched |> 
  filter(randomized & (category == "Sample from a randomized participant not sequenced")) |> 
  select(pid, visit_code, self_collected_swabs) |> 
  arrange(-self_collected_swabs) |> 
  gt(caption = "Expected samples (collected swabs according to CRF35) with missing metagenomics data")

In addition to the samples that have not been sequenced, there is one “participant x visit” that has been sequenced twice.

This is likely a pid or visit_code mistake because that entry is not listed in CRF35.

Code
matched <- 
  matched |> 
  mutate(
    category = 
      case_when(
        (sample_type == "Clinical sample") & (n_mg_rows > 1) ~ str_c(category, " (two distinct barcodes for the same participant x visit)"),
        TRUE ~ category
      )
    ) 

matched |>
  filter((sample_type == "Clinical sample"), n_mg_rows > 1) |> 
  gt()
pid visit_code in_CRF_35 self_collected_swabs other_specimen_collected sample_type control_type swab_barcode mg_pid has_mg_data location randomized pid_in_CRF_35 n_mg_rows category
068200070 1000 FALSE 0 NA Clinical sample 2167612 68200070 TRUE SA FALSE TRUE 2 Sequenced sample from non-randomized participants (two distinct barcodes for the same participant x visit)
068200070 1000 FALSE 0 NA Clinical sample 2171048 68200070 TRUE SA FALSE TRUE 2 Sequenced sample from non-randomized participants (two distinct barcodes for the same participant x visit)
Note

@Michael: I think you said you were able to recover the correct pid and/or visit code for these swabs.

There are also some samples that were not listed in CRF35 but have been sequenced - suspecting a pid or visit_code mistake

Code
matched |> 
  filter(category == "Sequenced sample not listed in CRF35") |> 
  gt()
pid visit_code in_CRF_35 self_collected_swabs other_specimen_collected sample_type control_type swab_barcode mg_pid has_mg_data location randomized pid_in_CRF_35 n_mg_rows category
068200310 1104 FALSE 0 NA Clinical sample 2269977 68200310 TRUE SA TRUE TRUE 1 Sequenced sample not listed in CRF35
068200350 0011 FALSE 0 NA Clinical sample 2271051 68200350 TRUE SA TRUE TRUE 1 Sequenced sample not listed in CRF35
068200278 1001 FALSE 0 NA Clinical sample 2280392 68200278 TRUE SA TRUE TRUE 1 Sequenced sample not listed in CRF35

Their visit code is not a “weekly” visit code.

Note

@Michael: Should we double-check with Sinaye? It could also be that the Ravel lab got sent a few daily swabs accidentally.

We add a column to the manifest data with the sample categories defined above.

Code
technical_metadata_agg <- 
  technical_metadata_agg |> 
  dplyr::left_join(
    matched |> select(pid, visit_code, category) |> distinct() |> 
      dplyr::rename(sample_category = category)
  )

technical_metadata_agg |> dplyr::count(sample_category) |> gt(caption = "Sample categories of the sequenced samples")
Sample categories of the sequenced samples
sample_category n
Clinical sample from another study 7
Expected sample 864
Not a clinical sample 18
Sequenced sample from non-randomized participants 72
Sequenced sample from non-randomized participants (two distinct barcodes for the same participant x visit) 2
Sequenced sample not listed in CRF35 3
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = "sample_category",
      description = "Sample classification based on the data about collected specimen as documented in CRF35."
    )
  )
Code
write_csv(
  matched |> 
    dplyr::rename(entry_in_CRF35 = in_CRF_35, mg_sequenced = has_mg_data) |> 
    select(
      swab_barcode, sample_type, control_type, 
      ends_with("pid"), location, randomized, visit_code, 
      entry_in_CRF35, self_collected_swabs, other_specimen_collected, pid_in_CRF_35,
      mg_sequenced, n_mg_rows, category
      ) |> 
    mutate(
      needs_review = 
        (
          category %in% c(
            "Sample from a randomized participant not sequenced", 
            "Sequenced sample not listed in CRF35",
            "Sequenced sample from non-randomized participants (two distinct barcodes for the same participant x visit)"
            )
          )
      ), 
  file = 
    str_c(
      get_output_dir(data_source = data_source),  
      "02_mg_and_clin_data_match_", today() |> str_remove_all("-"), ".csv"
    )
)

Defining the unique cross-assay identifier

The unique cross-assay identifier (uid) is

  • the concatenation of pid and visit_code for the “expected” clinical samples.
  • the swab_barcode for the non-clinical samples and the “unexpected” clinical samples.
Code
technical_metadata_agg <- 
  technical_metadata_agg |> 
  group_by(pid, visit_code) |> mutate(i = row_number()) |> ungroup() |> 
  mutate(
    uid = 
      case_when(
        str_detect(sample_type, "Clinical sample") & (i > 1) ~ str_c(pid, "_", visit_code, "_", i),
        str_detect(sample_type, "Clinical sample") ~ str_c(pid, "_", visit_code),
        TRUE ~ swab_barcode
      ),
  ) |> 
  relocate(uid, .before = mg_uid) |> 
  select(-i)
Code
dictionary <- 
  dictionary |> 
  bind_rows(
    tibble(
      name = "uid",
      description = "Unique sample identifier - common ID across VIBRANT assays."
    )
  )

Creating a SummarizedExperiment object from these tables

The SummarizedExperiment object will contain the following assays:

  • counts
  • counts_corr
  • rel_abs

We note some negative (although extremely small) values in the counts, counts_corr, and rel_abs assays (all for native L. crispatus, I assume it being an artifact from kSANITY). We will set these to 0.

The SE colData will contain the technical metadata, the total number of reads for each sample, along with the metagenomics-based CST, subCST, and VALENCIA’s assignment scores. For samples that have been sequenced twice, we will merge the sequencing and processing information from both sequencing runs.

The SE rowData will contain the VIRGO2 taxonomic table, augmented with the LBP strain information.

The LBP strain information is loaded as LBP_strain_info from the VIBRANT Dropbox file that contains information on the LBP strains

Since the names of the strains do not match exactly those used by Michael, we modify them to match those provided by Michael.

Code
LBP_strain_info <- 
  LBP_strain_info |> 
  mutate(
    strain_id = `VMRC ID`, 
    LBP = ifelse(is.na(LC106), "LC-115", "LC-106 & LC-115") |> factor(), 
    strain_origin = `Geographic source` |> factor()
  ) |>
  dplyr::rename(Biose_ID = `Biose ID`) |> #VMRC_ID = `VMRC ID` 
  dplyr::select(strain_id, LBP, strain_origin, Biose_ID) |> 
  arrange(strain_origin, LBP) |> 
  mutate(strain_id = strain_id |> fct_inorder()) |> 
  dplyr::select(strain_id, LBP, strain_origin, contains("ID")) |> 
  mutate(
    strain_id_mg = sub("_.*", "", strain_id),
    strain_id_mg = 
      case_when(
        strain_id_mg == "CC0028A1" ~ "C0028A1", 
        TRUE ~ strain_id_mg
      )
  )

LBP_strain_info |>   
  set_variable_labels(
    strain_id = "Strain ID", 
    strain_id_mg = "Strain ID (as in metagenomics data)",
    strain_origin = "Strain origin", 
    Biose_ID = "Biose ID"
  ) |> 
  gt(caption = "LBP strain information") |> 
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_column_labels())
LBP strain information
Strain ID LBP Strain origin Biose ID Strain ID (as in metagenomics data)
FF00018 LC-106 & LC-115 SA GA08 FF00018
FF00051 LC-106 & LC-115 SA GA09 FF00051
UC101_2_33 LC-106 & LC-115 SA GA12 UC101
FF00004 LC-115 SA GA07 FF00004
FF00064 LC-115 SA GA10 FF00064
FF00072 LC-115 SA GA11 FF00072
UC119_2_PB0238 LC-115 SA GA13 UC119
122010_1999_16 LC-115 SA GA14 122010
185329_1999_17 LC-115 SA GA15 185329
C0059E1 LC-106 & LC-115 US GA03 C0059E1
C0022A1 LC-106 & LC-115 US GA04 C0022A1
C0175A1 LC-106 & LC-115 US GA06 C0175A1
C0006A1 LC-115 US GA01 C0006A1
CC0028A1 LC-115 US GA02 C0028A1
C0112A1 LC-115 US GA05 C0112A1

NOTE : At the moment, the table counts includes an extra column “multiGenera”. For now, we remove the MuliGenera from the counts data frame; but in the future, Michael should provide the corrected counts and relative abundances with this extra column.

We also add the manifest column dictionary to the @metadata slot of the SE object.

Code
mg_to_SE <- function(counts, counts_corr, relabs, technical_metadata_agg, dictionary, VIRGO2_taxonomic_table, LBP_strain_info) {
  
  
  ## ASSAYS
  # remove "multiGenera" from counts # TODO: change later
  counts <- 
    counts |> 
    select(-c(MultiGenera))
  
  warning("The column 'MultiGenera' has been removed from the counts data frame. Please check if this is still applicable.")

  assay_counts <- 
    counts |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |> 
    drop_na() |> 
    t() |> 
    pmax(0)
  
  assay_counts_corr <- 
    counts_corr |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |>
    t() |> 
    pmax(0)
  
  assay_relative_ab <- 
    relabs |> 
    mutate(mg_uid = sampleID) |> 
    dplyr::select(-c(sampleID, CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("mg_uid") |>
    t() |> 
    pmax(0)
    
  
  ## colData 
  
  # We build the SE colData from the the technical metadata
  # to which we add the total number of non-human reads
  # and the cst related data
  se_coldata <- 
    technical_metadata_agg |> 
    dplyr::left_join(
      counts |> 
        mutate(mg_uid = sampleID) |> 
        select(mg_uid, CST, subCST, score) |>
        dplyr::rename(
          cst = CST,
          sub_cst = subCST,
          valencia_score = score
        ) |> 
        mutate(
          total_non_human_reads = 
            rowSums(counts |> select(-c(sampleID, CST, subCST, score)))
        ), 
      by = join_by(mg_uid)
    ) |> 
    select(uid, mg_uid, sample_type, control_type, total_non_human_reads, cst, sub_cst, valencia_score, everything()) |> 
    arrange(sample_type, uid)
  
  dictionary <- 
    dictionary |> 
    bind_rows(
      tibble(name = "total_non_human_reads", description = "Total `counts` per sample"),
      tibble(name = "cst", original_name = "CST", description = "VALENCIA CST based on metagenomic taxonomic composition"),
      tibble(name = "sub_cst", original_name = "subCST", description = "VALENCIA sub-CST based on metagenomic taxonomic composition"),
      tibble(name = "valencia_score", original_name = "score", description = "VALENCIA assignment score based on metagenomic taxonomic composition")
    )

  
  se_coldata <- se_coldata |> as.data.frame()
  
  ## rowData
  se_rowdata <- 
    tibble(
      taxon_id = 
        counts |> 
        dplyr::select(-c(sampleID, CST, subCST, score)) |>
        colnames()
    ) |> 
    dplyr::left_join(
      VIRGO2_taxonomic_table |> 
        mutate(
          taxon_id = Taxa,
          last_available_taxonomic_level = 
            Full_taxonomy |> str_remove_all(".*;") |> str_remove_all("_.*")
        ) |> 
        relocate(Full_taxonomy, .after = Species) |> 
        dplyr::rename(taxon_category = Category),
      by = join_by(taxon_id)
    ) |> 
    dplyr::left_join(
      LBP_strain_info |> 
        dplyr::rename(taxon_id = strain_id_mg),
      by = join_by(taxon_id)
    ) |> 
    mutate(
      taxon_label = 
        ifelse(!is.na(LBP), "LBP strain ", "") |> 
        str_c(taxon_id |> str_replace_all("_", " ")) |> 
        str_c(
          ifelse(
            last_available_taxonomic_level %in% c("g","f"), 
            str_c(" (", last_available_taxonomic_level, ")"), ""
          )
        ) |> 
        str_c(
          ifelse(taxon_id == "Lactobacillus_crispatus", " (native strains)", "")
        )
    ) |> 
    relocate(taxon_label, .after = taxon_id) |>
    janitor::clean_names() |>
    dplyr::rename(LBP = lbp) |> 
    as.data.frame() 
    
  # Harmonization of the order of samples and feature
  
  ordered_mg_uids <- se_coldata$mg_uid
  assay_counts <- assay_counts[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  assay_counts_corr <- assay_counts_corr[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  assay_relative_ab <- assay_relative_ab[, ordered_mg_uids] |> set_colnames(se_coldata$uid)
  se_coldata <- se_coldata |> set_rownames(se_coldata$uid)

  ordered_taxa <- se_rowdata$taxon_id
  assay_counts <- assay_counts[ordered_taxa, ]
  assay_counts_corr <- assay_counts_corr[ordered_taxa, ]
  assay_relative_ab <- assay_relative_ab[ordered_taxa, ]
  
  dictionary <- 
    dictionary |>
    mutate(
      name = 
        name |> 
        factor(
          levels = c(colnames(se_coldata),dictionary$name) |> unique()
        )
    ) |> 
    arrange(name)

  SummarizedExperiment::SummarizedExperiment(
    assays = list(counts = assay_counts, counts_corr = assay_counts_corr, rel_abs = assay_relative_ab),
    rowData = se_rowdata,
    colData = se_coldata,
    metadata = list(
      SE_creation_date = today(),
      colData_dictionary = dictionary
    )
  )
}
Code
SE_mg <- 
  mg_to_SE(
    counts = counts, counts_corr = counts_corr, relabs = relabs, 
    technical_metadata_agg = technical_metadata_agg, dictionary = dictionary,
    VIRGO2_taxonomic_table = VIRGO2_taxonomic_table, LBP_strain_info = LBP_strain_info
    )
Code
SE_mg <- 
  SE_mg |> 
  mutate(
    location = 
      case_when(
        str_detect(pid, "06810") ~ "US",
        str_detect(pid, "06820") ~ "SA",
        TRUE ~ NA_character_
      )
  )

Exploratory & QC analyses

Code
SE_mg
# A SummarizedExperiment-tibble abstraction: 751,548 × 56
# Features=778 | Samples=966 | Assays=counts, counts_corr, rel_abs
   .feature .sample        counts counts_corr rel_abs uid     mg_uid sample_type
   <chr>    <chr>           <dbl>       <dbl>   <dbl> <chr>   <chr>  <chr>      
 1 122010   068100004_0000      0           0       0 068100… MG_202 Clinical s…
 2 185329   068100004_0000      0           0       0 068100… MG_202 Clinical s…
 3 C0006A1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 4 C0022A1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 5 C0028A1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 6 C0059E1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 7 C0112A1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 8 C0175A1  068100004_0000      0           0       0 068100… MG_202 Clinical s…
 9 FF00004  068100004_0000      0           0       0 068100… MG_202 Clinical s…
10 FF00018  068100004_0000      0           0       0 068100… MG_202 Clinical s…
# ℹ 40 more rows
# ℹ 48 more variables: control_type <chr>, total_non_human_reads <dbl>,
#   cst <chr>, sub_cst <chr>, valencia_score <dbl>, swab_barcode <chr>,
#   pid <chr>, mg_pid <chr>, mg_visit_code <chr>, visit_code_flagged <chr>,
#   mg_sample_type <chr>, ext_lib_plate_nb <int>, ext_lib_plate_id <chr>,
#   ext_lib_position <chr>, sequencing_run <chr>, sequencing_date <chr>,
#   sequencing_lane <chr>, sequencing_sample_id <chr>, library_pool <chr>, …

Total number of counts/relative abundances per sample

Total number of non-human reads per sample

Code
SE_mg |> 
  colData() |> 
  as_tibble() |>
  ggplot() +
  aes(x = total_non_human_reads, fill = library_sequenced_twice) +
  geom_vline(xintercept = 500000, linetype = "dashed", color = "black") +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  facet_grid(sample_type ~ ., scale = "free") +
  labs(
    x = "Total number of non-human reads per sample", 
    y = "Number of samples"
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  colData() |> 
  as_tibble() |>
  ggplot() +
  aes(x = total_non_human_reads, fill = selected_for_re_extraction) +
  geom_vline(xintercept = 500000, linetype = "dashed", color = "black") +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  facet_grid(sample_type ~ ., scale = "free") +
  labs(
    x = "Total number of non-human reads per sample", 
    y = "Number of samples"
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  colData() |> 
  as_tibble()  |> 
  arrange(total_non_human_reads) |> 
  mutate(index = row_number()) |>
  select(index, uid, total_non_human_reads, sample_type, control_type, library_sequenced_twice, selected_for_re_extraction, sub_cst) |> 
  filter(total_non_human_reads < 1e6) |>
  gt()
index uid total_non_human_reads sample_type control_type library_sequenced_twice selected_for_re_extraction sub_cst
1 EQ05822476 2 Negative control Nuclease-free water FALSE FALSE IV-A
2 068200113_0000 107164 Clinical sample FALSE TRUE III-B
3 EQ05822493 111049 Negative control Unused swab + C2 FALSE FALSE III-B
4 068100026_2120 122821 Clinical sample FALSE FALSE III-B
5 EQ05822499 122897 Negative control Nuclease-free water FALSE FALSE IV-B
6 068200050_1900 211151 Clinical sample FALSE TRUE III-B
7 068200422_2120 284381 Clinical sample FALSE TRUE III-B
8 068100062_1100 303568 Clinical sample FALSE TRUE IV-B
9 068200361_1300 380193 Clinical sample FALSE TRUE IV-B
10 068200062_1700 425144 Clinical sample TRUE TRUE III-B
11 068200204_1900 429267 Clinical sample FALSE TRUE IV-C1
12 068100052_1000 446679 Clinical sample FALSE TRUE III-B
13 068100062_2120 469552 Clinical sample FALSE FALSE III-A
14 068200399_1200 528459 Clinical sample FALSE TRUE III-B
15 068100049_1100 548033 Clinical sample FALSE TRUE III-B
16 068200387_2120 549857 Clinical sample FALSE FALSE III-B
17 068200361_1900 580999 Clinical sample TRUE TRUE IV-A
18 068100029_1100 618230 Clinical sample FALSE TRUE IV-B
19 EQ05822501 635475 Negative control Nuclease-free water FALSE FALSE III-B
20 068200204_2120 722162 Clinical sample FALSE TRUE I-B
21 068200399_1300 725940 Clinical sample FALSE TRUE III-B
22 068200399_1900 733772 Clinical sample FALSE TRUE IV-B
23 068100014_0000 767569 Clinical sample FALSE TRUE III-A
24 068200127_1000 795144 Clinical sample TRUE TRUE IV-C1
25 068200285_0000 805893 Clinical sample FALSE TRUE IV-B
26 068100061_1100 819924 Clinical sample TRUE TRUE IV-B
27 068100049_1000 822383 Clinical sample FALSE FALSE III-B
28 068200361_1400 846815 Clinical sample FALSE FALSE IV-B
29 EQ05822500 857771 Negative control Unused swab + C2 FALSE FALSE III-B
30 068200259_0000 866059 Clinical sample FALSE FALSE IV-B
31 068100016_1100 878598 Clinical sample FALSE TRUE III-A
32 068100062_0000 882023 Clinical sample FALSE FALSE IV-B
33 068100062_1400 925127 Clinical sample FALSE FALSE IV-B
34 068200388_1200 930056 Clinical sample FALSE TRUE IV-B
35 068100026_1100 932868 Clinical sample FALSE FALSE III-B
36 068200239_1900 958264 Clinical sample FALSE TRUE III-B
37 068200247_1100 996799 Clinical sample TRUE FALSE IV-B
Code
SE_mg |> 
  colData() |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample", visit_code %in% c(seq(1000, 1900, by = 100), 2120)) |> 
  ggplot() +
  aes(x = total_non_human_reads, fill = selected_for_re_extraction) +
  geom_vline(xintercept = 500000, linetype = "dashed", color = "black") +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  facet_grid(visit_code ~ ., scale = "free") +
  labs(
    x = "Total number of non-human reads per sample", 
    y = "Number of samples"
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Total relative abundances per sample

We check that the relative abundances sum to 1 for each sample.

Code
tol <- 1e-5

SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  summarise(total_rel_abs = sum(rel_abs)) |> 
  ggplot(aes(x = total_rel_abs)) +
  geom_histogram(binwidth = tol) + 
  labs(x = "Total relative abundances per sample", 
       y = "Number of samples") +
  scale_x_continuous(limits = 1 + c(-1, 1) * 5 * tol) 

Code
rm(tol)
Code
# We check the relative abundances computation (should be `counts_corr/sum(counts_corr)`).

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  mutate(rel_abs_check = counts_corr/sum(counts_corr)) |> 
  ungroup() 

if (max(tmp$rel_abs_check - tmp$rel_abs) > 0.01) warning("!! Relative abundances do not match the computed values !!")

Non-bacterial DNA

There are some non-bacterial taxa in the dataset:

Code
SE_mg |> 
  as_tibble() |> 
  filter(taxon_category != "Bacteria") |> 
  group_by(.sample, taxon_category) |> 
  summarize(rel_abs = sum(rel_abs), .groups = "drop") |> 
  ggplot() +
  aes(x = rel_abs, fill = taxon_category) +
  geom_histogram(binwidth = 0.001) +
  facet_wrap(~ taxon_category, scales = "free") +
  scale_x_continuous("Total relative abundance per sample for that group", labels = scales::percent_format()) +
  ylab("Number of samples") +
  guides(fill = "none")

Code
SE_mg |> 
  as_tibble() |> 
  filter(taxon_category != "Bacteria") |> 
  group_by(.sample) |>
  mutate(total_rel_ab = sum(rel_abs)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_abs) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  filter(as.numeric(.sample) <= 20) |>
  ggplot() +
  aes(x = rel_abs, y = .sample |> fct_rev(), fill = taxon_category) +
  geom_col() +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top 20 samples with most non-bacterial relative abundances") 

Code
SE_mg |> 
  as_tibble() |> 
  filter(taxon_category != "Bacteria") |> 
  group_by(taxon_label) |>
  mutate(total_rel_ab = sum(rel_abs), max_rel_ab = max(rel_abs)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_abs) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  filter(as.numeric(taxon_label) <= 20, max_rel_ab > 0.001) |>
  ggplot() +
  aes(x = rel_abs, y = taxon_label |> fct_rev(), color = taxon_category) +
  geom_point(alpha = 0.3) +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top non-bacterial organisms") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(caption = "Each dot is a sample.")

Consequently, we create a new assay which contains the relative abundances of the bacteria only.

Code
SE_mg <- 
  SE_mg |>
  mutate(
    rel_abs_bact = rel_abs * (!is.na(taxon_category) & (taxon_category == "Bacteria"))
  )

assay(SE_mg, "rel_abs_bact") <- t(t(assay(SE_mg, "rel_abs_bact"))/colSums(assay(SE_mg, "rel_abs_bact")))

Control samples

There are 4 categories of control samples:

Code
SE_mg |> 
  colData() |> 
  as_tibble() |> 
  filter(sample_type != "Clinical sample") |>
  select(control_type, uid, ext_lib_plate_nb) |> 
  group_by(control_type) |> 
  arrange(control_type, ext_lib_plate_nb) |>
  gt(caption = "Control samples", row_group_as_column = TRUE) 
Control samples
uid ext_lib_plate_nb
98120009_1010 5
98120009_1020 5
98120009_1030 5
98120009_1040 5
98120009_1050 5
98120009_1060 5
98120009_1070 5
Mock 1 EQ05822515 1
EQ05822481 5
EQ05822509 8
EQ05822503 10
EQ05822445 11
Mock 2 EQ05822227 1
EQ05822473 3
EQ05822497 10
EQ05822439 11
Nuclease-free water EQ05822226 1
EQ05822476 3
EQ05822499 5
EQ05822501 8
EQ05822491 10
Unused swab + C2 EQ05822202 1
EQ05822493 5
EQ05822500 8
EQ05822485 10

These control samples have been created such that

  • Mock 1: pooled patient samples from CAP084 swabs

    • 50 non-BV
      • VMRC 15 L.crispatus isolates
      • 12 L.crispatus dominant samples
      • 23 L. iners dominant samples
    • 50 BV
      • 24 G. vaginalis dominant samples
      • BV bacteria, not G. vaginalis dominant
  • Mock 2: Pooled pure isolates (equal volumes of pure isolates at OD 0.1)

    • VMRC 15 L.crispatus isolates
    • G. vaginalis ATCC
    • A. vaginae ATCC
    • P. bivia ATCC
    • L. crispatus ATCC
    • L. gasseri ATCC
    • S. agalactiae ATCC
    • L. jensenii ATCC
    • F. magna

Taxonomic composition of these control samples:

Code
control_types <- SE_mg$control_type |> unique() |> sort() |> extract(-1)
Code
map(
  control_types,
  ~ {
    tmp <- 
      SE_mg |> 
      as_tibble() |> 
      filter(control_type == .x) |> 
      filter(rel_abs > 0) 
    top_taxa <- 
      tmp |> 
      group_by(taxon_label) |> 
      summarise(total_relabs = sum(rel_abs)) |> 
      arrange(desc(total_relabs)) |> 
      slice_head(n = 25) |> 
      pull(taxon_label) |> sort()
    
    all_taxa <- tmp$taxon_label |> unique() |> sort()
    
    tmp |> 
      mutate(
        sample_label = str_c(.sample, ifelse(selected_for_re_extraction, " (re-x)", ""))
      ) |> 
      ggplot() +
      aes(y = rel_abs, x = sample_label, fill = taxon_label) +
      geom_col(color = "white", linewidth = 0.1) +
      geom_point(aes(y = 1.1, size = log10(total_non_human_reads))) +
      scale_x_discrete("Samples") +
      scale_y_continuous("Rel. ab.") +
      scale_fill_discrete(
        "Top taxa", breaks = top_taxa #, limits = all_taxa, values = get_taxa_colors(all_taxa)
        ) + 
      facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      labs(title = str_c("Control samples : ", .x))
  }
)
[[1]]


[[2]]


[[3]]


[[4]]

Mock 1

For Mock 1 samples, the relative abundances are not very replicable across plates, but that seems to correlate with lower total number of non-human reads.

We check below if detection (presence/absence) is better and better correlate within Mocks than across Mocks.

Code
tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(str_detect(control_type, "Mock")) |> 
  mutate(present = (rel_abs > 1/1000)) |> 
  group_by(control_type, taxon_label) |>
  mutate(tot_prop = sum(rel_abs)) |> 
  ungroup() |> 
  arrange(control_type, -tot_prop) |> 
  mutate(taxon_label = taxon_label |> fct_inorder()) 
Code
tmp |> 
  ggplot() +
  aes(x = taxon_label, y = .sample, fill = rel_abs |> log10()) +
  geom_tile() +
  ylab("") +
  scale_x_discrete("Taxa", breaks = NULL) +
  scale_fill_gradient(low = "gray99", high = "red", na.value = "white") +
  facet_grid(control_type ~ ., scales = "free", space = "free")

It looks acceptable.

Mock 2

In the Mock 2, we check for the proportion of species/taxa that were not expected based on the “theoretical” composition of Mock 2

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(control_type == "Mock 2") |> 
  filter(rel_abs > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_abs, x = .sample, fill = expected) +
  geom_col(color = "black", linewidth = 0.1) +
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab.") + 
  facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, ext_lib_plate_nb, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_abs), 2), .groups = "drop") |>
  filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample ext_lib_plate_nb expected tot_rel_ab_pc
EQ05822227 1 FALSE 4.53
EQ05822439 11 FALSE 3.53
EQ05822473 3 FALSE 1.83
EQ05822497 10 FALSE 3.13

When only considering bacterial relative abundances, results look similar:

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(control_type == "Mock 2") |> 
  filter(rel_abs_bact > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_abs_bact, x = .sample, fill = expected) +
  geom_col(color = "black", linewidth = 0.1) +
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab. (of bacterial content)") + 
  facet_grid(. ~ str_c("Plate ", ext_lib_plate_nb), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, ext_lib_plate_nb, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_abs_bact), 2), .groups = "drop") |>
  filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample ext_lib_plate_nb expected tot_rel_ab_pc
EQ05822227 1 FALSE 4.52
EQ05822439 11 FALSE 3.50
EQ05822473 3 FALSE 1.82
EQ05822497 10 FALSE 3.13

However, when examining the list of unexpected taxa, it seems that the “unexpectedness” can likely be attributed to shared genes that were assigned to a single taxa in VIRGO2:

Code
tmp |> 
  filter(!expected) |> 
  arrange(.sample, -rel_abs_bact) |> 
  group_by(.sample) |> 
  slice_head(n = 20) |> 
  ungroup() |> 
  arrange(-rel_abs_bact) |> 
  mutate(
    taxon_label = taxon_label |> fct_inorder() |> fct_rev(),
    expected = 
      case_when(
        str_detect(taxon_label, "actobacillus|Prevotella|Streptococcus|Finegoldia") ~ "Likely shared genes",
        TRUE ~ "??? contamination"
      )
    ) |>
  ggplot() +
  aes(x = .sample, y = taxon_label, size = rel_abs_bact, col = expected) +
  geom_point() +
  scale_y_discrete("Top 20 unexpected taxa") +
  scale_color_manual("", values = c("red", "steelblue1")) +
  xlab("Sample") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Alpha-diversity of clinical samples and controls

Code
vegan::diversity(SE_mg |> assay("rel_abs"), index = "shannon", MARGIN = 2) |> 
  as_tibble() |> 
  mutate(uid = colnames(SE_mg)) |> 
  dplyr::left_join(SE_mg |> colData() |> as_tibble(), by = join_by(uid)) |> 
  ggplot() +
  aes(x = control_type, y = value |> exp(), col = sample_type, fill = sample_type) +
  facet_grid(. ~ sample_type, scales = "free", space = "free") +
  geom_violin(scale = "width", color = NA, alpha = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 0.5, alpha = 0.5) +
  scale_y_log10() +
  labs(
    x = "",
    y = "Effective number of species\nbased on shannon diversity index"
  ) +
  guides(fill = "none", color = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Negative controls have lower total reads and lower alpha-diversity.

Proportion of LBP strains per sample

Do we detect any LBP strain?

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP)) |> 
  ggplot() +
  aes(x = rel_abs, fill = (rel_abs == 0)) +
  geom_histogram(binwidth = 0.01) +
  scale_fill_manual("", values = c("steelblue", "gray"), labels = c("Rel. ab = 0", "Rel. ab > 0")) +
  xlab("Relative abundance of LBP strains") +
  ylab("Number of samples x LBP strain")

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP), rel_abs > 0) |> 
  ggplot() +
  aes(x = rel_abs) +
  geom_histogram(binwidth = 0.001) +
  xlab("Non-zero relative abundance of LBP strains") +
  ylab("Number of samples") +
  facet_grid(LBP + .feature ~ .) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP), rel_abs > 0) |>
  ggplot() +
  aes(x = .feature) +
  geom_bar() +
  scale_x_discrete("LBP strains") +
  scale_y_continuous("Number of samples with non-zero relative abundances") +
  facet_grid(. ~ LBP, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Code
number_of_non_zero_LBP_strains_per_sample <- 
  SE_mg |> 
  as_tibble() |>  
  filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    number_of_non_zero_LBP_strains = sum(rel_abs > 0),
  ) 

number_of_non_zero_LBP_strains_per_sample |> 
  ggplot() +
  aes(x = number_of_non_zero_LBP_strains) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous("Number of non-zero LBP strains per sample", breaks = 0:15, minor_breaks = 0) +
  ylab("Number of samples") 

There are 234 samples (24.22%) with at least one LBP strain detected (non-zero relative abundance).

Code
SE_mg |> 
  as_tibble() |>  
  filter(!is.na(LBP)) |> 
  dplyr::left_join(
    number_of_non_zero_LBP_strains_per_sample,
    by = join_by(.sample)
  ) |>
  filter(number_of_non_zero_LBP_strains != 0) |>
  group_by(.feature) |>
  mutate(tot_rel_abs_for_strain = sum(rel_abs)) |> 
  ungroup() |> 
  arrange(LBP, -tot_rel_abs_for_strain) |>
  mutate(.feature = .feature |> fct_inorder()) |> 
  group_by(.sample) |> 
  mutate(
    LBP_rel_abs = rel_abs / sum(rel_abs),
    score = weighted.mean(LBP_rel_abs, .feature |> as.numeric()),
    total_LBP_rel_abs = sum(rel_abs)
  ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder()) |> 
  ggplot() +
  aes(x = .feature, y = .sample, fill = rel_abs) +
  geom_tile() +
  scale_fill_continuous("Rel. ab", low = "white", high = "steelblue2") +
  scale_x_discrete("Strains") +
  scale_y_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ LBP + strain_origin, scales = "free_x", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    caption = "Rel. ab. of LBP strains in samples with at least one LBP strain detected (non-zero)",
  )

Compositions

Code
# creating a taxa category "other"
summarized_rel_abs <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.feature) |> 
  mutate(max_rel_ab = max(rel_abs), mean_rel_ab = mean(rel_abs)) |> 
  ungroup() |> 
  mutate(
    taxa = 
      case_when(
        !is.na(LBP) ~ taxon_label,
        max_rel_ab > 1/3 ~ taxon_label,
        TRUE ~ "Other"
      ),
    is_lacto = str_detect(genus, "Lactobacillus") & (taxa != "Other")
  ) |> 
  arrange(LBP, is_lacto, desc(max_rel_ab)) |>
  mutate(
    taxa = taxa |> fct_inorder()
  ) |> 
  group_by(.sample, sample_type, control_type, sample_category, location, pid, visit_code, taxa, LBP, is_lacto, strain_origin) |>
  summarize(rel_abs = sum(rel_abs), .groups = "drop") |>
  arrange(.sample, -rel_abs) |> 
  group_by(.sample) |>
  mutate(
    score = weighted.mean(taxa |> as.numeric(), rel_abs), 
    tot = sum(rel_abs),
    dom = taxa[1]
    ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder())
Code
summarized_rel_abs |> 
  ggplot() +
  aes(x = .sample, y = rel_abs, fill = taxa) +
  geom_col() +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ sample_type + str_wrap(sample_category, 50), scales = "free", space = "free") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 90, hjust = 0),
    legend.position = "bottom",
  ) +
  ggtitle("Taxonomic composition of all samples")

Code
summarized_rel_abs |> 
  filter(sample_category != "Expected sample") |> 
  ggplot() +
  aes(x = .sample, y = rel_abs, fill = taxa) +
  geom_col() +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ str_c(sample_type,"\n", control_type) + str_wrap(sample_category, 20), scales = "free", space = "free") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 90, hjust = 0),
    legend.position = "bottom",
  ) +
  ggtitle('"Unexpected" samples')

Code
summarized_rel_abs |> 
  filter(sample_type == "Clinical sample") |> # 
  ggplot() +
  aes(x = pid, y = rel_abs, fill = taxa) +
  geom_col() +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
  ) +
  ggtitle("Longitudinal profiles for all clinical samples")

Code
summarized_rel_abs |> 
  filter(sample_type == "Clinical sample", sample_category %in% c("Expected sample")) |> # 
  ggplot() +
  aes(x = pid, y = rel_abs, fill = taxa) +
  geom_col() +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location, scales = "free_x", space = "free_x") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "bottom",
  ) +
  ggtitle('Longitudinal profiles for all "expected" clinical samples')

Code
summarized_rel_abs |> 
  filter(sample_type == "Clinical sample", visit_code == "0000") |> # 
  group_by(.sample) |> mutate(any_LBP = sum(rel_abs[!is.na(LBP)]) > 0) |> ungroup() |> 
  filter(any_LBP) |> 
  ggplot() +
  aes(x = pid, y = rel_abs, fill = taxa) +
  geom_col(color = "white") +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 15)
    ) +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  ggtitle('Participants with LBP strains at screening visit')

Code
summarized_rel_abs |> 
  filter(sample_type == "Clinical sample", visit_code == "0000") |> # 
  group_by(.sample) |> mutate(any_LBP = sum(rel_abs[!is.na(LBP)]) > 0) |> ungroup() |> 
  filter(any_LBP) |> 
  ggplot() +
  aes(x = pid, y = rel_abs, fill = strain_origin) +
  geom_col(color = "white") +
  scale_x_discrete("Participant") + # , breaks = NULL) +
  facet_grid(visit_code ~ location + sample_category, scales = "free_x", space = "free_x") +
  theme(
    strip.text.x = element_text(angle = 0, hjust = 0.5),
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  ggtitle('Participants with LBP strains at screening visit')

PCoA on counts

Code
SE_mg <- 
  SE_mg |> 
  mutate(
    combined_sample_type = str_c(sample_type," (", control_type,")") |> str_remove(" \\(\\)")
  )
Code
dist_matrix <-
  vegdist(
    assay(SE_mg, "counts") |> t(), 
    method = "bray")

pcoa <- wcmdscale(dist_matrix, eig = TRUE)
print(pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)

          Inertia Rank
Total      359.39     
Real       428.63  344
Imaginary  -69.24  621

Results have 966 points, 344 axes

Eigenvalues:
  [1] 60.97 50.39 33.62 21.71 15.98 13.65 12.50 10.89  9.51  8.84  8.08  6.79
 [13]  6.25  6.08  5.51  4.77  4.59  4.39  4.38  4.21  4.07  3.73  3.61  3.32
 [25]  3.15  3.07  2.94  2.81  2.69  2.58  2.46  2.45  2.32  2.27  2.20  2.10
 [37]  2.04  1.96  1.87  1.80  1.71  1.62  1.56  1.54  1.52  1.44  1.40  1.38
 [49]  1.32  1.29  1.24  1.21  1.18  1.17  1.12  1.10  1.08  1.07  1.05  1.03
 [61]  1.00  0.97  0.95  0.93  0.89  0.88  0.87  0.85  0.82  0.80  0.79  0.77
 [73]  0.75  0.74  0.72  0.72  0.71  0.68  0.67  0.66  0.64  0.63  0.62  0.61
 [85]  0.60  0.59  0.58  0.57  0.55  0.54  0.53  0.52  0.51  0.51  0.50  0.49
 [97]  0.48  0.48  0.47  0.46  0.45  0.45  0.44  0.43  0.42  0.42  0.41  0.41
[109]  0.40  0.40  0.39  0.39  0.38  0.38  0.37  0.36  0.36  0.35  0.34  0.34
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 60.975 16.966
2 50.394 14.022
3 33.616 9.353
4 21.708 6.040
5 15.980 4.446
6 13.648 3.798
7 12.496 3.477
8 10.893 3.031
9 9.509 2.646
10 8.836 2.459
Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_data <- 
  tibble(
    uid = rownames(pcoa$points),
    PCoA = pcoa$points
  ) |> 
  dplyr::left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(uid)
  )
Code
pcoa_plot <- function(pcoa_tb, pcoa, color_var, axes = 1:2){
  
  pcoa_tb <- 
    pcoa_tb |> 
    mutate(
      col = as.factor(!!sym(color_var)),
      x = PCoA[, axes[1]],
      y = PCoA[, axes[2]]
      )

  pcoa_tb |>
    ggplot() +
    aes(x = x, y = y, color = col) +
    geom_point(size = 2, alpha = 0.7) +
    coord_fixed() +
    labs(
      x = paste0("PCoA ", axes[1], " (", round(100 * pcoa$eig[axes[1]] / sum(pcoa$eig), 1), "%)"),
      y = paste0("PCoA ", axes[2], " (", round(100 * pcoa$eig[axes[2]] / sum(pcoa$eig), 1), "%)"),
      color = color_var
    ) 
}

Type of sample

Code
sample_type_colors <- c("gray","gray85", "dodgerblue", "black", "purple", "red")

pcoa_plot(pcoa_data, pcoa, "combined_sample_type", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_data, pcoa, "combined_sample_type", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix ~ pcoa_data$sample_type)
manova

Extraction Plate

Code
pcoa_plot(pcoa_data, pcoa, "ext_lib_plate_nb", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "ext_lib_plate_nb", c(3, 4))

Permutation test

Code
manova <- adonis2(dist_matrix ~ pcoa_data$ext_lib_plate_nb)
manova

Selected for re-extraction

Code
pcoa_plot(pcoa_data, pcoa, "selected_for_re_extraction", c(1, 2)) 

Code
pcoa_plot(pcoa_data, pcoa, "selected_for_re_extraction", c(3, 4)) 

Bioinformatics Processing Batch

Code
pcoa_plot(pcoa_data, pcoa, "bioinformatics_processing_batch", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "bioinformatics_processing_batch", c(3, 4))

Library Pool

Code
pcoa_plot(pcoa_data, pcoa, "library_pool", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "library_pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_data |> filter(sequencing_lane %in% 1:4), pcoa, "sequencing_lane", c(1, 2))

Code
pcoa_plot(pcoa_data |> filter(sequencing_lane %in% 1:4), pcoa, "sequencing_lane", c(3, 4))

PCoA on relative abundances

Code
dist_matrix_relab <-
  vegdist(
    assay(SE_mg, "rel_abs_bact") |> t(), 
    method = "bray")


pcoa_relab <- wcmdscale(dist_matrix_relab, eig = TRUE)
print(pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)

          Inertia Rank
Total      309.96     
Real       382.05  324
Imaginary  -72.08  641

Results have 966 points, 324 axes

Eigenvalues:
  [1] 71.73 59.23 39.17 19.71 15.34 12.08 10.45  8.64  7.37  6.55  5.78  5.42
 [13]  5.25  4.79  4.31  4.24  3.84  3.76  3.23  3.15  2.97  2.79  2.68  2.57
 [25]  2.24  2.16  2.07  1.94  1.78  1.73  1.59  1.54  1.52  1.41  1.37  1.35
 [37]  1.27  1.23  1.18  1.16  1.11  1.08  1.05  1.03  0.98  0.94  0.93  0.91
 [49]  0.88  0.86  0.82  0.80  0.79  0.78  0.76  0.75  0.73  0.70  0.69  0.67
 [61]  0.66  0.64  0.63  0.62  0.59  0.56  0.56  0.55  0.54  0.52  0.51  0.49
 [73]  0.49  0.48  0.47  0.47  0.46  0.44  0.43  0.42  0.42  0.41  0.40  0.39
 [85]  0.39  0.38  0.37  0.37  0.36  0.35  0.35  0.35  0.33  0.33  0.32  0.32
 [97]  0.31  0.30  0.30  0.29  0.28  0.28  0.27  0.27  0.26  0.26  0.26  0.25
[109]  0.25  0.24  0.24  0.24  0.23  0.23  0.23  0.22  0.22  0.21  0.21  0.21
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa_relab$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa_relab$eig/sum(pcoa_relab$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa_relab$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa_relab$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 71.727 23.141
2 59.234 19.110
3 39.165 12.635
4 19.712 6.359
5 15.339 4.949
6 12.079 3.897
7 10.448 3.371
8 8.639 2.787
9 7.372 2.378
10 6.552 2.114
Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab_data <- 
  tibble(
    uid = rownames(pcoa_relab$points),
    PCoA = pcoa_relab$points
  ) |> 
  dplyr::left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(uid)
  )

Type of sample

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "combined_sample_type", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "combined_sample_type", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$sample_type)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$sample_type)
          Df SumOfSqs    R2      F Pr(>F)    
Model      3     3.41 0.011 3.5669  0.001 ***
Residual 962   306.55 0.989                  
Total    965   309.96 1.000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Extraction Plate

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "ext_lib_plate_nb", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "ext_lib_plate_nb", c(3, 4))

Permutation test

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$ext_lib_plate_nb)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$ext_lib_plate_nb)
          Df SumOfSqs    R2      F Pr(>F)    
Model      1    4.339 0.014 13.686  0.001 ***
Residual 964  305.625 0.986                  
Total    965  309.964 1.000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Selected for re-extraction

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "selected_for_re_extraction", c(1, 2)) 

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "selected_for_re_extraction", c(3, 4)) 

Library Pool

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "library_pool", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "library_pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_relab_data |> filter(sequencing_lane %in% 1:4), pcoa_relab, "sequencing_lane", c(1, 2))

Code
pcoa_plot(pcoa_relab_data |> filter(sequencing_lane %in% 1:4), pcoa_relab, "sequencing_lane", c(3, 4))

Visit

Code
tmp <- 
  pcoa_relab_data |> 
  filter(
    sample_type == "Clinical sample", 
    visit_code %in% c(seq(1000, 1900, by =100), 2120)
    )

v_colors <- c("red", "green3", colorRampPalette(c("dodgerblue1", "dodgerblue4","purple"))(7))
  

pcoa_plot(tmp, pcoa_relab, "visit_code", c(1, 2)) +  scale_color_manual(values = v_colors) 

Code
pcoa_plot(tmp, pcoa_relab, "visit_code", c(3,4)) +  scale_color_manual(values = v_colors) 

Saving SummarizedExperiment objects

We save the SE objects to disk

Code
saveRDS(
  SE_mg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "02_se_mg_", today() |> str_remove_all("-"), ".rds"
    )
  )